We use the svm function of the e1071 library. The relevant args are kernel and cost (for tolerating margin errors; higher values narrow margins).
An annoying blue outlier prevents a linear separation
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.4 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(rlang)
##
## Attaching package: 'rlang'
## The following objects are masked from 'package:purrr':
##
## %@%, as_function, flatten, flatten_chr, flatten_dbl, flatten_int,
## flatten_lgl, flatten_raw, invoke, list_along, modify, prepend,
## splice
# Simulate 20 observations in 2 classes; shifting class 1 up-right by 1
# leaves the classes overlapping, so they are not linearly separable.
set.seed(1)
X <- matrix(rnorm(20*2), ncol = 2)
Y <- c(rep(-1, 10), rep(1,10))
X[Y==1,] <- X[Y==1,] +1
# Scatterplot of the raw data, coloured by class label.
X %>%
data.frame() %>%
mutate(y = Y) %>%
ggplot(aes(X1, X2, col = factor(y))) +
geom_point()
Now use SVM. Factor coercion is necessary if we don’t want SVM regression
library(e1071)
# Factor response => C-classification; a numeric Y would trigger regression.
dat <- tibble(X1 = X[,1], X2 = X[,2], Y = factor(Y))
# Linear kernel with a fairly high cost (narrow margins); scale = FALSE
# keeps the raw coordinates so plots match the scatterplot above.
svm1 <- svm(Y~.,data = dat, kernel = "linear", cost =10, scale = FALSE)
svm1
##
## Call:
## svm(formula = Y ~ ., data = dat, kernel = "linear", cost = 10, scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 10
##
## Number of Support Vectors: 7
Using base plotting, the support vectors are circles and the other observations crosses. Index shows which are which.
plot(svm1, dat)
# $index holds the row numbers of the support vectors.
dat[svm1$index,]
## # A tibble: 7 x 3
## X1 X2 Y
## <dbl> <dbl> <fct>
## 1 -0.626 0.919 -1
## 2 0.184 0.782 -1
## 3 0.330 0.620 -1
## 4 0.487 -0.156 -1
## 5 -1.21 0.946 1
## 6 0.955 0.585 1
## 7 0.984 0.606 1
summary(svm1)
##
## Call:
## svm(formula = Y ~ ., data = dat, kernel = "linear", cost = 10, scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 10
##
## Number of Support Vectors: 7
##
## ( 4 3 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
If we try a smaller cost, we get a much bigger margin and many more support vectors. We can’t get the margin or the hyperplane equation directly, unfortunately
# Lower cost widens the margin, so many more points become support vectors.
svm2 <- svm(Y~.,data = dat, kernel = "linear", cost =.1, scale = FALSE)
svm2
##
## Call:
## svm(formula = Y ~ ., data = dat, kernel = "linear", cost = 0.1, scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.1
##
## Number of Support Vectors: 16
plot(svm2, dat)
We can use tune to cross-validate. The best model has 16 support vectors and cost of .1, indicating a wide margin.
set.seed(1)
# tune() cross-validates (10-fold by default) over the cost grid.
tuned <- tune(svm, Y~., data = dat, kernel="linear",
ranges = list(cost=c(.001,.01,.1,1,5,10,100)))
# Sort CV results by error; cost = 0.1 comes out on top.
summary(tuned)$performances %>% arrange(error)
## cost error dispersion
## 1 1e-01 0.05 0.1581139
## 2 1e+00 0.15 0.2415229
## 3 5e+00 0.15 0.2415229
## 4 1e+01 0.15 0.2415229
## 5 1e+02 0.15 0.2415229
## 6 1e-03 0.55 0.4377975
## 7 1e-02 0.55 0.4377975
summary(tuned$best.model)
##
## Call:
## best.tune(method = svm, train.x = Y ~ ., data = dat, ranges = list(cost = c(0.001,
## 0.01, 0.1, 1, 5, 10, 100)), kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.1
##
## Number of Support Vectors: 16
##
## ( 8 8 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
best <- tuned$best.model
The predict method works as you’d expect:
# Generate an independent test set with the same structure as training.
set.seed(1)
X_test <- matrix(rnorm(20*2), ncol = 2)
Y_test <- sample(c(-1,1), 20, replace = TRUE)
# BUG FIX: the original indexed without the row comma
# (X_test[Y_test==1] <- ...), which recycles the 20-long logical over all
# 40 matrix cells and shifts the wrong elements. Shift whole rows instead.
X_test[Y_test==1,] <- X_test[Y_test==1,] + 1
# BUG FIX: the test tibble must be built from X_test, not the training X.
# (The confusion matrix printed below came from the buggy version.)
test_dat <- tibble(X1 = X_test[,1], X2 = X_test[,2], Y = as.factor(Y_test))
ypred <- predict(best, test_dat)
table(ypred, truth = test_dat$Y)
## truth
## ypred -1 1
## -1 5 6
## 1 6 3
Should a separating hyperplane exist, svm() will find it. We ensure separability by shifting the class-1 observations a further 0.5 in both predictors
# Shift class 1 a further 0.5 in both coordinates to force separability.
X[Y==1,] <- X[Y==1,]+0.5
# NOTE(review): col = 2 colours every point red; something like
# col = Y + 3 would be needed to distinguish the classes visually.
plot(X, col=2, pch=19)
Set cost much higher to ensure very thin margins. We see there are only three support vectors, and no training errors are made. This is probably overfit.
dat <- tibble(X1 = X[,1], X2 = X[,2], Y = as.factor(Y))
# A huge cost essentially forbids margin violations, approximating the
# maximal margin (hard-margin) classifier.
true_svm <- svm(Y~., data = dat, kernel = "linear", cost =1e5, scale = FALSE)
plot(true_svm, dat)
Obviously, just change the arg of kernel. The relevant params are degree and gamma
set.seed(1)
# Rows 1-100 shifted up-right, 101-150 down-left, 151-200 left centred:
# class 2 sits in the middle, giving a nonlinear class boundary.
X <- matrix(rnorm(200*2), ncol =2)
X[1:100,] <- X[1:100,]+2
X[101:150,] <- X[101:150,] -2
Y <- c(rep(1, 150), rep(2, 50))
dat <- tibble(X1 = X[,1], X2 = X[,2], Y = as.factor(Y))
# Pre-fill the arguments every svm() call below shares.
svm_call <- partial(svm, formula = Y~.,scale = FALSE, ... = )
plot(X, col = Y)
Now split up data and try a radial kernel. Too many training errors, so we should cross-validate \(\gamma\).
# Half the 200 observations form the training set.
train <- sample(200, 100)
# First radial-kernel attempt with gamma = 1, default-ish cost.
radial <- svm_call(kernel = "radial", gamma = 1, data = dat[train,], cost = 1)
summary(radial)
##
## Call:
## svm(formula = Y ~ ., data = ..3, kernel = "radial", gamma = 1, cost = 1,
## scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 48
##
## ( 29 19 )
##
##
## Number of Classes: 2
##
## Levels:
## 1 2
It turns out a cost of 1 and \(\gamma\) of 2 optimize the fit:
# Joint CV grid over cost and gamma; keep decision values for ROC later.
radial_tuned <- tune(svm, Y~., data = dat[train,], kernel = "radial", ranges = list(cost = c(.1,1,20,100,1000), gamma = c(.5,1,2,3,4)), decision.values=TRUE)
# NOTE(review): `$performance` works only via R's partial matching of the
# list element `performances`; spelling it out would be safer.
summary(radial_tuned)$performance %>% arrange(error)
## cost gamma error dispersion
## 1 1e+00 0.5 0.06 0.08432740
## 2 1e+00 1.0 0.07 0.09486833
## 3 1e+00 2.0 0.08 0.11352924
## 4 2e+01 0.5 0.09 0.09944289
## 5 1e+03 1.0 0.09 0.09944289
## 6 1e+00 3.0 0.09 0.11005049
## 7 1e+00 4.0 0.09 0.12866839
## 8 1e+02 1.0 0.10 0.08164966
## 9 1e+02 0.5 0.10 0.09428090
## 10 2e+01 1.0 0.10 0.09428090
## 11 2e+01 2.0 0.10 0.10540926
## 12 2e+01 3.0 0.10 0.13333333
## 13 1e+03 0.5 0.12 0.10327956
## 14 2e+01 4.0 0.12 0.13984118
## 15 1e+02 2.0 0.12 0.13165612
## 16 1e+03 2.0 0.13 0.10593499
## 17 1e+02 3.0 0.13 0.10593499
## 18 1e+03 3.0 0.15 0.10801234
## 19 1e+02 4.0 0.15 0.13540064
## 20 1e+03 4.0 0.17 0.13374935
## 21 1e-01 1.0 0.18 0.16865481
## 22 1e-01 2.0 0.26 0.15776213
## 23 1e-01 3.0 0.27 0.13374935
## 24 1e-01 4.0 0.27 0.13374935
## 25 1e-01 0.5 0.27 0.14944341
rad <- summary(radial_tuned)$best.model
We use the ROCR package to produce ROC curves. (Recall an ROC curve plots the true positive rate against the false positive rate across different classification thresholds. A perfect model has an area under the curve of 1.)
We can get predicted values from the model object. They all have low magnitude.
library(ROCR)
# Plot an ROC curve (true positive rate vs false positive rate) for a
# vector of decision values / scores against the true labels.
# `...` is forwarded to plot() so callers can set colours, add = TRUE, etc.
# BUG FIX: the original accepted `...` but silently dropped it; it also
# used `=` where the file otherwise assigns with `<-`.
plot_roc <- function(pred, truth, ...) {
  predob <- prediction(pred, truth)
  perf <- performance(predob, "tpr", "fpr")
  plot(perf, ...)
}
# Refit at the tuned parameters, keeping decision values for the ROC curve.
rad <- svm_call(kernel = "radial", data = dat[train,], gamma =2, cost =1, decision.values = TRUE)
fitted <- rad$decision.values
# NOTE(review): dat[train, "Y"] is a one-column tibble; it appears to be
# accepted here, but dat$Y[train] would be the plainer choice.
plot_roc(fitted, dat[train, "Y"])
These plots are good for comparing different levels of \(\gamma\).
Non-binary classification is done automatically using the one-vs-one approach.
This is contained in Khan, a dataset of tissue samples from four types of tumor. It has training and test sets.
# Khan: gene-expression data with four tumour classes, pre-split.
khan <- ISLR::Khan
# Class counts in the training and test responses.
map(list(khan$ytrain, khan$ytest), table)
## [[1]]
##
## 1 2 3 4
## 8 23 12 20
##
## [[2]]
##
## 1 2 3 4
## 3 6 6 5
This is a typical \(p>n\) dataset where a hyperplane is useful. We can use just a linear kernel, and the model classifies the training data perfectly! But that’s to be expected with this structure.
# NOTE(review): X is stored as a single matrix column inside the tibble;
# the formula interface expands it, which evidently works here.
khan_train <- tibble(X = khan$xtrain, Y = as.factor(khan$ytrain))
mod_khan <- svm_call(data = khan_train, kernel = "linear", cost = 10)
summary(mod_khan)
##
## Call:
## svm(formula = Y ~ ., data = ..1, kernel = "linear", cost = 10, scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 10
##
## Number of Support Vectors: 54
##
## ( 18 20 9 7 )
##
##
## Number of Classes: 4
##
## Levels:
## 1 2 3 4
table(mod_khan$fitted, khan_train$Y)
##
## 1 2 3 4
## 1 8 0 0 0
## 2 0 23 0 0
## 3 0 0 12 0
## 4 0 0 0 20
But the model makes few errors on the test set:
# Held-out performance on the Khan test split.
khan_test <- tibble(X = khan$xtest, Y =as.factor(khan$ytest))
khan_pred <- predict(mod_khan, khan_test)
table(khan_pred, khan_test$Y)
##
## khan_pred 1 2 3 4
## 1 3 0 0 0
## 2 0 6 2 0
## 3 0 0 4 0
## 4 0 0 0 5
# Overall accuracy from a square confusion matrix: correctly classified
# counts (the diagonal) divided by the total number of observations.
pred_accuracy <- function(tab) {
  n_correct <- sum(diag(tab))
  n_total <- sum(tab)
  n_correct / n_total
}
The first problem asks us to sketch some 2D hyperplanes
# Exercise 9.1: sketch the hyperplanes 1 + 3x - y = 0 and -2 + x + 2y = 0.
# In slope-intercept form these are y = 3x + 1 and y = -0.5x + 1.
# BUG FIX: the original intercepts c(-1, 2) contradicted the shading
# inequalities below — both lines cross the y-axis at 1.
planes <- tibble(plane = c("1", "2"), slope = c(3, -.5), b_0 = c(1, 1))
# Shade a lattice by which side of each hyperplane a point falls on
# (colour = plane 1, point stroke = plane 2), then overlay the lines.
expand_grid(x = seq(-5, 5, by = .2), y= seq(-5, 5, by = .2)) %>%
mutate(p1 = 1 + 3*x -y >0, p2 = -2 + x +2*y >0) %>%
ggplot(aes(x = x, y = y, col = p1, stroke = p2))+
geom_point(size = 2, alpha = .5)+
geom_abline(data = planes, aes(slope = slope, intercept = b_0, color = plane))
We are asked to do the same for a polynomial decision boundary. Completing the square shows it is a circle with centre (-1, 2) and radius 2:
# Exercise 9.2: x^2 + 2x + y^2 - 4y + 1 = 0 completes the square to
# (x+1)^2 + (y-2)^2 = 4, a circle with centre (-1, 2) and radius 2.
# factor(logical): FALSE -> "red" (inside), TRUE -> "blue" (outside).
expand_grid(x = seq(-5, 5, by = .2), y= seq(-5, 5, by = .2)) %>%
mutate(Class = factor(x^2 +2*x + y^2 -4*y + 1 > 0, labels = c("red", "blue"))) %>%
ggplot(aes(x = x, y = y, col = Class)) +
geom_point(size = 2, alpha = .5)
And then to classify several points:
# Capture the disc (1+x)^2 + (2-y)^2 <= 4 as an unevaluated expression.
hyper <- expr((1 +x)^2 + (2-y)^2 <=4)
# NOTE(review): the environment returned by list2env() is discarded and
# the grid is rebuilt below, so this line only prints an environment —
# its purpose is unclear and it could likely be removed.
expand_grid(x = -5:5, y = -5:5) %>% list2env()
## <environment: 0x0000000021748ef8>
# Evaluate the captured expression over an integer grid; `env =`
# partially matches eval()'s `envir` argument (spelling it out is safer).
eval(hyper, env = expand_grid(x = -5:5, y = -5:5)) %>%
factor(labels = c("blue", "red"))
## [1] blue blue blue blue blue blue blue blue blue blue blue blue blue blue blue
## [16] blue blue blue blue blue blue blue blue blue blue blue blue blue blue red
## [31] blue blue blue blue blue blue blue blue blue red red red blue blue blue
## [46] blue blue blue blue red red red red red blue blue blue blue blue blue
## [61] blue red red red blue blue blue blue blue blue blue blue blue red blue
## [76] blue blue blue blue blue blue blue blue blue blue blue blue blue blue blue
## [91] blue blue blue blue blue blue blue blue blue blue blue blue blue blue blue
## [106] blue blue blue blue blue blue blue blue blue blue blue blue blue blue blue
## [121] blue
## Levels: blue red
We are asked to consider polynomial hyperplanes.
It is trivial to show the polynomial is a nonlinear transformation.
\[T(x,y)= \begin{bmatrix}x^2+2x\\y^2-4y\end{bmatrix}\] \[T(c(x,y)) = \begin{bmatrix}c^2x^2+2cx\\c^2y^2-4cy\end{bmatrix}\] \[cT(x,y) = \begin{bmatrix}cx^2+2cx\\cy^2-4cy\end{bmatrix}\] Bzzt. Nonlinear.
If instead we treat the squared terms as separate coordinates, the transformation is linear:
\[T_2(x^2, y^2, x, y)=\begin{bmatrix} x^2\\ 2x\\ y^2\\ -4y\end{bmatrix}\]
\[cT_2 =\begin{bmatrix} cx^2\\ c2x\\ cy^2\\ -4cy\end{bmatrix}\]
\[T_2c(x^2, y^2, x, y) =\begin{bmatrix} cx^2\\ c2x\\ cy^2\\ -4cy\end{bmatrix}\]
I’m too lazy to prove additive associativity as well but it’s pretty obvious.
We are now asked to consider the MMC for the following data. We see by plotting it cleanly separates observations 2 and 5 and 3 and 6:
# Toy data for the maximal-margin-classifier exercise: 7 labelled points.
dat2 <- tibble(obs = 1:7, x_1 = c(3,2,4,1,2,4,4),
x_2 = c(4,2,4,4,1,3,1),
class = as.factor(c("Red", "Red", "Red", "Red", "Blue", "Blue", "Blue")))
# Plot with observation numbers and the candidate separating line y = x - 0.5.
dat2 %>% ggplot(aes(x = x_1, y = x_2, col = class)) +
geom_point() +
geom_text(aes(label = obs), nudge_y = -.25, col = "black" ) +
scale_color_manual(values = c("blue", "red")) +
geom_abline(slope = 1, intercept = -0.5)
I inexplicably attempt linear regression on a categorical response variable
# Least-squares "classification" by hand via the normal equations.
# Note dat2[-c(1,4)] (no comma) drops columns 1 and 4 via list-style
# indexing — the same columns as dat2[,-c(1,4)].
a_trans_a <- t(dat2[,-c(1,4)]) %*% as.matrix(dat2[-c(1,4)])
# Code Red as -1 and Blue as +1.
b <- ifelse(dat2$class == "Red", -1, 1)
# Fitted values X (X'X)^{-1} X' b; the signs separate the two classes.
as.matrix(dat2[,2:3]) %*% solve(a_trans_a) %*% t(dat2[,-c(1,4)]) %*% b
## [,1]
## [1,] -0.7391304
## [2,] -0.1739130
## [3,] -0.3478261
## [4,] -1.5217391
## [5,] 0.3043478
## [6,] 0.1304348
## [7,] 1.0869565
I crudely figure out the coefficients, then plot the line.
# Slope through support vectors 2 and 3 (each result is a 1x1 tibble).
b_1 <- (dat2[3, "x_2"] - dat2[2, "x_2"]) /(dat2[3, "x_1"] -dat2[2, "x_1"])
b_2 <- -b_1
# Intercept: half the vertical gap between observations 2 and 5.
b_0 <- -(dat2[2, "x_2"] - dat2[5, "x_2"])/2
The equation comes to : \[0.5 -X_1 +X_2=0\] We see it correctly classifies. There are four support vectors: observations 2,5,3, and 6. Since observation 7 is not one of them, moving it would not impact the margin.
# Evaluate the hyperplane 0.5 - x1 + x2 on each observation...
eqn <- expr(0.5 - x_1 +x_2)
fits <- eval(eqn, env = dat2)
# ...and classify by sign (positive side = Red).
cols <- ifelse(fits >0, "Red", "Blue")
Here we are asked to demonstrate that nonlinear kernels are superior when the decision boundary is nonlinear.
The polynomial model actually does worse. It misclassified almost every -1 as 1. I squared the values of x1 for the 1 class, which must be the cause.
A standard polynomial model is no good, since \(x1\) is really a piecewise function. Maybe a radial one would work?
# Piecewise data: x1 is squared for class 1 only, so the class boundary
# is nonlinear (and not a smooth polynomial).
dat <- tibble(x1 = rnorm(100, sd = 2), x2 = rnorm(100, sd =1), Y = as.factor(c(rep(-1, 50), rep(1, 50)))) %>%
mutate(x1 = if_else(Y==1, x1^2, x1))
# 50/50 train/test split.
inds <- sample(100, 50, replace = FALSE)
train <- dat[inds,]
test <- dat[-inds,]
linear <- svm_call(kernel="linear", data = train)
poly <- svm_call(kernel = "polynomial", degree = 2, data = train)
summary(linear)
##
## Call:
## svm(formula = Y ~ ., data = ..2, kernel = "linear", scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 23
##
## ( 12 11 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
summary(poly)
##
## Call:
## svm(formula = Y ~ ., data = ..3, kernel = "polynomial", degree = 2,
## scale = FALSE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 1
## degree: 2
## coef.0: 0
##
## Number of Support Vectors: 38
##
## ( 19 19 )
##
##
## Number of Classes: 2
##
## Levels:
## -1 1
lin_pred <- predict(linear, test)
poly_pred <- predict(poly, test)
# Held-out confusion matrix for the linear fit.
table(lin_pred, test$Y)
##
## lin_pred -1 1
## -1 14 2
## 1 13 21
table(poly_pred, test$Y)
##
## poly_pred -1 1
## -1 23 11
## 1 4 12
A radial kernel with wide margins and low gamma does okay. None of these models can really handle the silly piecewise polynomial I created, though.
tune(svm, Y~., kernel = "radial", ranges = list(cost = c(.001, .01,.1,1,2,5,10,25), gamma = c(.001, .005,.01,1,2,3,5,10,25,50)), data = dat)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost gamma
## 10 0.005
##
## - best performance: 0.26
# NOTE(review): cost = 2, gamma = .01 are close to — but not exactly —
# the tuned optimum printed above (cost 10, gamma 0.005).
rad <- svm_call(kernel = "radial", cost = 2, gamma = .01, data = train)
rad_pred <- predict(rad, test)
table(rad_pred, test$Y)
##
## rad_pred -1 1
## -1 15 2
## 1 12 21
We are now asked to develop a nonlinear decision boundary through logistic regression.
The plot shows we clearly need two hyperplanes, since the class is determined by a difference of squares, resulting in decision boundaries with slopes \(y = \pm{x}\)
# Class is sign(x1^2 - x2^2), so the true decision boundaries are the
# lines y = x and y = -x, drawn for reference.
dat <- tibble(x1 = runif(500) - 0.5, x2 = runif(500) -0.5, Y = as.factor(ifelse(x1^2-x2^2 > 0, 1, -1)))
dat %>% ggplot(aes(x = x1, y = x2, col = Y))+
geom_point() +
geom_abline(slope = c(-1, 1), intercept = 0)
The decision boundary from a logistic model is obviously linear, and obviously wrong.
inds <- sample(500,250, replace = FALSE)
train <- dat[inds,]
test <- dat[-inds,]
# NOTE(review): the model is fit on the full `dat`, not `train` — likely
# an oversight, since predictions below are made on `train`.
log_mod <- glm(Y ~., data = dat, family = "binomial")
summary(log_mod)
##
## Call:
## glm(formula = Y ~ ., family = "binomial", data = dat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.320 -1.183 1.044 1.148 1.292
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.03137 0.08988 0.349 0.727
## x1 0.45927 0.30412 1.510 0.131
## x2 -0.17189 0.31068 -0.553 0.580
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 693.08 on 499 degrees of freedom
## Residual deviance: 690.41 on 497 degrees of freedom
## AIC: 696.41
##
## Number of Fisher Scoring iterations: 3
# predict() returns the linear predictor by default; thresholding the
# link scale at 0 is equivalent to thresholding the probability at 0.5.
train_pred <- predict(log_mod, train) %>%
{if_else(. <=0, -1, 1)}
train %>% mutate(train_pred = train_pred) %>%
ggplot(aes(x = x1, y = x2, col = factor(train_pred))) +
geom_point()
I cheat shamelessly by using an absolute value transformation, approximating the function applied to the data. The decision boundary is plainly nonlinear.
# abs() mirrors the true |x1| vs |x2| rule, so the classes are perfectly
# separated — hence the non-convergence warnings below.
# NOTE(review): `dat =` relies on partial matching of glm()'s `data`
# argument; spelling it out would be clearer.
log_poly <- glm(dat = train, Y~abs(x1) + abs(x2), family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(log_poly)
##
## Call:
## glm(formula = Y ~ abs(x1) + abs(x2), family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -5.134e-04 -2.000e-08 -2.000e-08 2.000e-08 4.474e-04
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -34.05 5585.62 -0.006 0.995
## abs(x1) 4442.10 334359.29 0.013 0.989
## abs(x2) -4361.64 329201.22 -0.013 0.989
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3.4643e+02 on 249 degrees of freedom
## Residual deviance: 5.0239e-07 on 247 degrees of freedom
## AIC: 6
##
## Number of Fisher Scoring iterations: 25
# Threshold the linear predictor at 0 (probability 0.5), as before.
train_pred2 <- predict(log_poly, train) %>%
{if_else(. <=0, -1, 1)}
train %>% mutate(train_pred = train_pred2) %>%
ggplot(aes(x = x1, y = x2, col = factor(train_pred))) +
geom_point()
Now for an SVM. A simple polynomial model fails badly, classifying nearly every point as 1. Since both predictors are uniformly distributed with \(\mu=0\), this makes sense; a linear combination of squares would never be negative.
The linear model does no better.
# Fit quadratic and linear SVMs on the piecewise data and inspect plots.
svm_mod <- svm_call(data = train, kernel = "polynomial", degree = 2)
svm_linear <- svm_call(data = train, kernel = "linear")
svm_pred <- predict(svm_mod, test)
table(svm_pred, test$Y)
##
## svm_pred -1 1
## -1 119 131
## 1 0 0
# Test-set predictions of the quadratic model.
test %>% mutate(svm_pred = svm_pred) %>%
ggplot(aes(x = x1, y = x2, col = svm_pred)) +
geom_point()
# BUG FIX: the two training plots below originally mapped colour to the
# global `svm_pred` (test-set predictions) instead of the `fitted` column
# just created, silently plotting the wrong values over training points.
train %>% mutate(fitted = svm_mod$fitted) %>%
ggplot(aes(x = x1, y = x2, col = fitted)) +
geom_point()
train %>% mutate(fitted = svm_linear$fitted) %>%
ggplot(aes(x = x1, y = x2, col = fitted)) +
geom_point()
I’m not sure anything can beat the absolute value model, since the true class depends on which predictor has a greater absolute value.
##7. This last problem sets us to work on Auto.
It seems models with cost close to default did best, suggesting wide margins were favored.
auto <- ISLR::Auto
# Binary target: above-median mpg.
# NOTE(review): Y is left numeric (0/1), so svm()/tune() below default to
# eps-regression rather than C-classification, and the "error" column is
# CV mean squared error. Converting with as.factor(Y) would give true
# classification results — confirm whether regression was intended.
auto$Y <- with(auto, ifelse(mpg > median(mpg), 1, 0))
# Dense cost grid: elementwise product of 1:9 with each order of magnitude.
costs <- expand.grid(1:9, c(10^(-4:1), 5, 10)) %>%
reduce(`*`)
auto_svm <- tune(svm, data = auto, ranges = list(cost = costs), Y~., kernel = "linear")
auto_svm$performances %>% arrange(error)
## cost error dispersion
## 1 0.7000 0.07457993 0.02493678
## 2 0.8000 0.07465282 0.02448779
## 3 0.5000 0.07465596 0.02591999
## 4 1.0000 0.07470399 0.02364995
## 5 0.6000 0.07474078 0.02567954
## 6 0.9000 0.07479536 0.02403630
## 7 0.4000 0.07480339 0.02633267
## 8 0.3000 0.07599905 0.02769283
## 9 0.2000 0.07669357 0.02866855
## 10 2.0000 0.07724646 0.02470995
## 11 0.1000 0.07806516 0.02888733
## 12 0.0900 0.07836531 0.02876927
## 13 0.0800 0.07867388 0.02874516
## 14 0.0700 0.07915595 0.02873414
## 15 3.0000 0.07955621 0.02619332
## 16 0.0600 0.07974930 0.02859040
## 17 0.0500 0.08022371 0.02845921
## 18 0.0400 0.08061956 0.02832170
## 19 0.0300 0.08120042 0.02791287
## 20 0.0200 0.08180348 0.02689920
## 21 4.0000 0.08189063 0.02803433
## 22 0.0100 0.08328451 0.02502373
## 23 0.0080 0.08346783 0.02408232
## 24 5.0000 0.08353442 0.02927000
## 25 5.0000 0.08353442 0.02927000
## 26 0.0090 0.08356568 0.02454552
## 27 0.0070 0.08358663 0.02345686
## 28 0.0060 0.08385087 0.02303372
## 29 0.0050 0.08458558 0.02271859
## 30 6.0000 0.08501264 0.02960830
## 31 0.0040 0.08553406 0.02199359
## 32 7.0000 0.08602654 0.03013767
## 33 0.0030 0.08672235 0.02127339
## 34 8.0000 0.08763960 0.03050868
## 35 0.0020 0.08861018 0.02040878
## 36 9.0000 0.08883145 0.03046519
## 37 10.0000 0.08961398 0.03037644
## 38 10.0000 0.08961398 0.03037644
## 39 10.0000 0.08961398 0.03037644
## 40 15.0000 0.09324130 0.03126257
## 41 0.0010 0.09327329 0.01822369
## 42 0.0009 0.09440446 0.01808444
## 43 0.0008 0.09602139 0.01781292
## 44 20.0000 0.09640049 0.03130665
## 45 20.0000 0.09640049 0.03130665
## 46 20.0000 0.09640049 0.03130665
## 47 0.0007 0.09778000 0.01746809
## 48 25.0000 0.09781597 0.03174504
## 49 30.0000 0.10001217 0.03308969
## 50 30.0000 0.10001217 0.03308969
## 51 30.0000 0.10001217 0.03308969
## 52 0.0006 0.10067583 0.01761787
## 53 35.0000 0.10132141 0.03313449
## 54 40.0000 0.10303381 0.03416239
## 55 40.0000 0.10303381 0.03416239
## 56 40.0000 0.10303381 0.03416239
## 57 45.0000 0.10448388 0.03507022
## 58 0.0005 0.10482663 0.01715515
## 59 50.0000 0.10537511 0.03511215
## 60 50.0000 0.10537511 0.03511215
## 61 60.0000 0.10716387 0.03559360
## 62 60.0000 0.10716387 0.03559360
## 63 70.0000 0.10882065 0.03648409
## 64 70.0000 0.10882065 0.03648409
## 65 80.0000 0.10933583 0.03674232
## 66 80.0000 0.10933583 0.03674232
## 67 90.0000 0.11016791 0.03793817
## 68 90.0000 0.11016791 0.03793817
## 69 0.0004 0.11142236 0.01621353
## 70 0.0003 0.13109338 0.01470929
## 71 0.0002 0.20336318 0.01941809
## 72 0.0001 0.32580316 0.01964479
Oddly, radial models have the highest error, declining as cost falls before increasing. Polynomial error is consistent across costs. Linear error varies inversely with cost. I supplied some very low cost values that led to excessive margins, I think.
Gamma has barely any effect on error rate. In isolation, it seems other parameters have little effect on error rate.
I finish off by plotting every pair of predictors, because I can. The decision boundaries are clearest with horsepower and weight.
# NOTE(review): 1:0 expands to c(1, 0), and gamma = 0 makes the radial
# kernel degenerate (constant 1) — this entry is probably a typo
# (perhaps 0:1 or a finer sequence was intended). Also `=` is used for
# top-level assignment where the rest of the file uses `<-`.
gammas = c(1:0, seq(10, 50, by = 10))
rad <- tune(svm, kernel = "radial", data = auto, Y~ ., ranges = list(cost = costs, gamma = gammas))
poly <- tune(svm, kernel = "polynomial", data = auto, Y~ ., ranges = list(cost = costs, degree = 1:7))
# Smoothed CV error across kernels as cost grows (log2 scale).
list(Linear =auto_svm$performances, Polynomial =poly$performances, Radial = rad$performances) %>%
bind_rows(.id = "Type") %>%
ggplot(aes(x = log2(cost), y = error, col = Type)) +
geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Marginal effect of gamma (only the first row per gamma value is kept,
# so this ignores the interaction with cost).
rad$performances %>% distinct(gamma, .keep_all = TRUE) %>% ggplot(aes(x = gamma, y = error)) +
geom_line()
# Same for polynomial degree.
poly$performances %>% distinct(degree, .keep_all = TRUE) %>%
ggplot(aes(x = degree, y = error)) +
geom_line()
# Scatterplots for every pair of columns (mpg and Y included), coloured
# by class.
map(combn(names(auto), m =2, simplify = FALSE), ~ggplot(auto, aes(x = !!sym(.x[1]), y = !!sym(.x[2]), col = as.factor(Y))) + geom_point())
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
##
## [[17]]
##
## [[18]]
##
## [[19]]
##
## [[20]]
##
## [[21]]
##
## [[22]]
##
## [[23]]
##
## [[24]]
##
## [[25]]
##
## [[26]]
##
## [[27]]
##
## [[28]]
##
## [[29]]
##
## [[30]]
##
## [[31]]
##
## [[32]]
##
## [[33]]
##
## [[34]]
##
## [[35]]
##
## [[36]]
##
## [[37]]
##
## [[38]]
##
## [[39]]
##
## [[40]]
##
## [[41]]
##
## [[42]]
##
## [[43]]
##
## [[44]]
##
## [[45]]